Risk Deep Dive: Detailed risk analysis¶

This notebook is built to work with the repo Make targets (e.g.make report-both YEARS="YYYY YYYY..." MONTHS="1 2 .. 12") It always prefers the current run (your summaries/<RUN_TAG>/ folder) when you pass YEARS/MONTHS, so the report reflects exactly what you ingested.

How to run (recommended)¶

From the repo root:

make all-both YEARS="YYYY YYYY..." MONTHS="1 2 .. 12"

The Makefile sets environment variables (e.g. `CITIBIKE_PARQUET_DIR`, `CITIBIKE_YEARS`, `CITIBIKE_MONTHS`) which this notebook reads.
In [1]:
# --- Setup (STRICT): load summaries + FORCE year/month risk tables (NO overall fallback) ---

from __future__ import annotations

from pathlib import Path
import os
import re
import pandas as pd
import matplotlib.pyplot as plt

# Notebook-friendly display
try:
    from IPython.display import display, Markdown
except Exception:
    display = print
    Markdown = lambda x: x

# Ensure figures render in executed notebook
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except Exception:
    pass

plt.ioff()  # nbconvert-friendly

# ---------------------------
# Repo discovery
# ---------------------------
def find_repo_root(start: Path) -> Path:
    start = start.resolve()
    for p in [start, *start.parents]:
        if (p / "Makefile").exists():
            return p
    raise FileNotFoundError(f"Could not find repo root (Makefile) from CWD={Path.cwd().resolve()}")

REPO_ROOT = find_repo_root(Path.cwd())
SUMMARIES_ROOT = REPO_ROOT / "summaries"

# ---------------------------
# Helpers
# ---------------------------
def _parse_int_list(val: str | None):
    if val is None:
        return None
    s = str(val).strip()
    if not s:
        return None
    parts = re.split(r"[,\s]+", s)
    out = []
    for p in parts:
        if not p:
            continue
        try:
            out.append(int(p))
        except Exception:
            pass
    return out or None

def read_csv_strict(path: Path) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(f"Missing required CSV: {path}")
    return pd.read_csv(path)

def read_csv_optional(path: Path) -> pd.DataFrame | None:
    return pd.read_csv(path) if path.exists() else None

def _filter_year_month(df: pd.DataFrame, years: list[int] | None, months: list[int] | None) -> pd.DataFrame:
    out = df.copy()
    if years is not None and "year" in out.columns:
        out["year"] = pd.to_numeric(out["year"], errors="coerce")
        out = out[out["year"].isin(years)]
    if months is not None and "month" in out.columns:
        out["month"] = pd.to_numeric(out["month"], errors="coerce")
        out = out[out["month"].isin(months)]
    return out

# ---------------------------
# Inputs from Makefile
# ---------------------------
PARQUET_DIR_ENV = (os.environ.get("CITIBIKE_PARQUET_DIR") or "").strip()
RUN_DIR_ENV     = (os.environ.get("CITIBIKE_RUN_DIR") or "").strip()
MODE_ENV        = (os.environ.get("CITIBIKE_MODE") or os.environ.get("MODE") or "").strip().lower()

YEARS_FILTER  = _parse_int_list(os.environ.get("CITIBIKE_YEARS")  or os.environ.get("YEARS"))
MONTHS_FILTER = _parse_int_list(os.environ.get("CITIBIKE_MONTHS") or os.environ.get("MONTHS"))

PARQUET_DIR = Path(PARQUET_DIR_ENV) if PARQUET_DIR_ENV else Path()

if RUN_DIR_ENV:
    RUN_DIR = Path(RUN_DIR_ENV)
else:
    run_tag = PARQUET_DIR.name if str(PARQUET_DIR).strip() else ""
    RUN_DIR = (SUMMARIES_ROOT / run_tag) if run_tag else Path()

# Resolve relative -> absolute
if str(RUN_DIR).strip() and not RUN_DIR.is_absolute():
    RUN_DIR = (REPO_ROOT / RUN_DIR).resolve()
if str(PARQUET_DIR).strip() and not PARQUET_DIR.is_absolute():
    PARQUET_DIR = (REPO_ROOT / PARQUET_DIR).resolve()

# ---------------------------
# Strict checks
# ---------------------------
if not SUMMARIES_ROOT.exists():
    raise FileNotFoundError(f"Expected summaries/ folder at: {SUMMARIES_ROOT}")
if not RUN_DIR.exists():
    raise FileNotFoundError(f"Expected run summaries at: {RUN_DIR}")

print("REPO_ROOT:", REPO_ROOT)
print("RUN_DIR:", RUN_DIR)
print("PARQUET_DIR:", PARQUET_DIR if str(PARQUET_DIR).strip() else "(not set)")
print("MODE (env):", MODE_ENV or "(not set)")
print("YEARS_FILTER:", YEARS_FILTER, "MONTHS_FILTER:", MONTHS_FILTER)

# Figures dir
FIG_DIR = REPO_ROOT / "reports" / RUN_DIR.name / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)
print("FIG_DIR:", FIG_DIR)

def savefig(filename: str):
    out = FIG_DIR / filename
    plt.savefig(out, dpi=200, bbox_inches="tight")
    print("Saved:", out)

# ---------------------------
# Load core per-run summaries (required)
# ---------------------------
df_year  = read_csv_strict(RUN_DIR / "citibike_trips_by_year.csv")
df_month = read_csv_strict(RUN_DIR / "citibike_trips_by_month.csv")
df_dow   = read_csv_strict(RUN_DIR / "citibike_trips_by_dow.csv")
df_hour  = read_csv_strict(RUN_DIR / "citibike_trips_by_hour.csv")

# Optional outputs
df_station = read_csv_optional(RUN_DIR / "citibike_station_exposure.csv")

# ---------------------------
# Load RISK (FORCE granular)
# ---------------------------
risk_year_path = RUN_DIR / "station_risk_exposure_plus_crashproximity_by_year.csv"
risk_ym_path   = RUN_DIR / "station_risk_exposure_plus_crashproximity_by_year_month.csv"

df_risk_year = read_csv_optional(risk_year_path)
df_risk_ym   = read_csv_optional(risk_ym_path)

# Force granular selection (no overall fallback)
if df_risk_ym is not None:
    df_risk = df_risk_ym
    risk_source = "by_year_month"
elif df_risk_year is not None:
    df_risk = df_risk_year
    risk_source = "by_year"
else:
    df_risk = None
    risk_source = "missing"
    raise FileNotFoundError(
        "No per-year/per-month risk CSVs found in RUN_DIR.\n"
        f"Expected one of:\n  - {risk_ym_path}\n  - {risk_year_path}\n"
        "If you truly want overall-only risk, load station_risk_exposure_plus_crashproximity.csv explicitly (but you said you don't)."
    )

# Highlights
highlights_path = RUN_DIR / "summary_highlights.md"

# Mode detection
mode = (
    str(df_year["mode"].iloc[0]).lower()
    if ("mode" in df_year.columns and len(df_year))
    else (MODE_ENV or "unknown")
)
print("Detected mode:", mode)

# Apply filters defensively
df_year  = _filter_year_month(df_year,  YEARS_FILTER, MONTHS_FILTER)
df_month = _filter_year_month(df_month, YEARS_FILTER, MONTHS_FILTER)
df_dow   = _filter_year_month(df_dow,   YEARS_FILTER, MONTHS_FILTER)
df_hour  = _filter_year_month(df_hour,  YEARS_FILTER, MONTHS_FILTER)

df_risk  = _filter_year_month(df_risk,  YEARS_FILTER, MONTHS_FILTER)

run_label = RUN_DIR.name

print("\nRisk files found:")
print(" - by_year:", "YES" if df_risk_year is not None else "NO")
print(" - by_year_month:", "YES" if df_risk_ym is not None else "NO")
print("Using df_risk =", risk_source)
print("df_risk columns:", list(df_risk.columns))
print("Unique years in df_risk:", sorted(pd.to_numeric(df_risk["year"], errors="coerce").dropna().unique().astype(int).tolist()) if "year" in df_risk.columns else "(no year)")
print("Unique months in df_risk:", sorted(pd.to_numeric(df_risk["month"], errors="coerce").dropna().unique().astype(int).tolist()) if "month" in df_risk.columns else "(no month)")
REPO_ROOT: /home/hamzeh-khanpour/Desktop/CITIBIK-main
RUN_DIR: /home/hamzeh-khanpour/Desktop/CITIBIK-main/summaries/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc
PARQUET_DIR: /home/hamzeh-khanpour/Desktop/CITIBIK-main/data/processed/citibike_parquet/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc
MODE (env): nyc
YEARS_FILTER: [2019, 2020, 2023, 2025] MONTHS_FILTER: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
FIG_DIR: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures
Detected mode: nyc

Risk files found:
 - by_year: YES
 - by_year_month: YES
Using df_risk = by_year_month
df_risk columns: ['mode', 'start_station_id', 'start_station_name', 'year', 'month', 'trips', 'start_trips', 'end_trips', 'touchpoints', 'station_lat', 'station_lng', 'crashes_within_500m', 'crashes_within_500m_per_100k_trips', 'risk_proxy_available', 'data_quality']
Unique years in df_risk: [2019, 2020, 2023, 2025]
Unique months in df_risk: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

Risk Distribution Overview¶

In [2]:
# --- HIGH-RISK STATIONS (nbconvert-safe, self-contained + year/month column on plot) ---

import re
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

MIN_TRIPS = 5000
TOP_N = 10
HIGH_RISK_PCT = 90  # top 10% by risk rate among credible rows

_SCORE_RE = re.compile(r"^axa_partner_scorecard_(\d+)m\.csv$")

def _available_scorecards(run_dir: Path) -> list[int]:
    radii = []
    for p in run_dir.glob("axa_partner_scorecard_*m.csv"):
        m = _SCORE_RE.match(p.name)
        if m:
            radii.append(int(m.group(1)))
    return sorted(set(radii))

def _pick_scorecard_path(run_dir: Path) -> Path | None:
    radii = _available_scorecards(run_dir)
    if not radii:
        return None
    # Stable default: prefer 500m if present; else max available
    chosen = 500 if 500 in radii else max(radii)
    p = run_dir / f"axa_partner_scorecard_{chosen}m.csv"
    return p if p.exists() else None

def _safe_read_csv(p: Path) -> pd.DataFrame | None:
    try:
        return pd.read_csv(p) if p and p.exists() else None
    except Exception as e:
        print(f"ERROR reading {p}: {e}")
        return None

# -----------------------------
# Ensure df_score exists (no NameError under nbconvert)
# -----------------------------
if "RUN_DIR" in globals() and RUN_DIR is not None:
    run_dir = Path(RUN_DIR)
else:
    run_dir = Path(".").resolve()

if "df_score" not in globals() or df_score is None or len(df_score) == 0:
    score_path = _pick_scorecard_path(run_dir)
    df_score = _safe_read_csv(score_path) if score_path else None

if df_score is None or len(df_score) == 0:
    print("  No scoring data found (df_score is missing/empty).")
    print("RUN_DIR used:", run_dir)
else:
    print("\n" + "=" * 70)
    print("HIGH-RISK STATIONS ANALYSIS")
    print("=" * 70)

    s = df_score.copy()

    # ---- Resolve column names robustly ----
    id_col   = "start_station_id"   if "start_station_id"   in s.columns else ("station_id"   if "station_id"   in s.columns else None)
    name_col = "start_station_name" if "start_station_name" in s.columns else ("station_name" if "station_name" in s.columns else None)
    lat_col  = "station_lat" if "station_lat" in s.columns else ("lat" if "lat" in s.columns else None)
    lng_col  = "station_lng" if "station_lng" in s.columns else ("lng" if "lng" in s.columns else None)

    exposure_col = "exposure_trips" if "exposure_trips" in s.columns else ("trips" if "trips" in s.columns else None)
    rate_col     = "eb_risk_rate_per_100k_trips" if "eb_risk_rate_per_100k_trips" in s.columns else (
                   "risk_rate_per_100k_trips" if "risk_rate_per_100k_trips" in s.columns else None
                 )
    crash_col    = "crash_count" if "crash_count" in s.columns else None
    cred_col     = "credibility_flag" if "credibility_flag" in s.columns else None

    # Helpful context columns (optional)
    has_year = "year" in s.columns
    has_month = "month" in s.columns

    missing_required = [k for k, v in {
        "station_id": id_col,
        "station_name": name_col,
        "exposure_trips": exposure_col,
        "risk_rate": rate_col,
    }.items() if v is None]

    if missing_required:
        print("Cannot run high-risk analysis; missing columns:", missing_required)
        print("Available columns:", list(s.columns))
    else:
        # ---- Clean numeric columns ----
        s[exposure_col] = pd.to_numeric(s[exposure_col], errors="coerce")
        s[rate_col]     = pd.to_numeric(s[rate_col], errors="coerce")
        if crash_col is not None:
            s[crash_col] = pd.to_numeric(s[crash_col], errors="coerce")

        if has_year:
            s["year"] = pd.to_numeric(s["year"], errors="coerce").astype("Int64")
        if has_month:
            s["month"] = pd.to_numeric(s["month"], errors="coerce").astype("Int64")

        # ---- Credible definition ----
        if cred_col is not None:
            credible = s[s[cred_col].astype(str).str.lower() == "credible"].copy()
        else:
            credible = s[s[exposure_col] >= MIN_TRIPS].copy()
            print(f"Note: no credibility_flag column; using exposure_trips ≥ {MIN_TRIPS:,} as credible.")

        credible = credible.dropna(subset=[exposure_col, rate_col]).copy()
        credible = credible[credible[exposure_col] >= MIN_TRIPS].copy()

        if len(credible) == 0:
            print("No credible rows with usable risk data.")
        else:
            # ---- Define high risk threshold on the credible distribution ----
            cutoff = float(np.nanpercentile(credible[rate_col].values, HIGH_RISK_PCT))
            high_risk = credible[credible[rate_col] >= cutoff].copy()

            print(f"High-risk threshold: {HIGH_RISK_PCT}th percentile of {rate_col} among credible rows")
            print(f"Cutoff value: {cutoff:.2f}")
            print(f"High-risk rows: {len(high_risk):,}")
            print(f"Total exposure in high-risk rows: {high_risk[exposure_col].sum():,.0f} trips")

            # ---- Show Top N by risk rate ----
            cols_to_show = [c for c in [
                "mode",
                "year" if has_year else None,
                "month" if has_month else None,
                id_col, name_col,
                lat_col, lng_col,
                exposure_col,
                crash_col,
                rate_col,
            ] if c and c in high_risk.columns]

            top_risk = (
                high_risk.sort_values([rate_col, exposure_col], ascending=False)
                         .head(TOP_N)[cols_to_show]
                         .copy()
            )

            print(f"\nTop {TOP_N} Highest-Risk (credible) station-periods:")
            display(top_risk)

            # ---- Print run scope so the plot is defensible ----
            if has_year and has_month:
                scope = (
                    s.dropna(subset=["year", "month"])
                     .groupby(["year", "month"])
                     .size()
                     .reset_index(name="rows")
                     .sort_values(["year", "month"])
                )
                print("\nScorecard scope (rows by year-month):")
                # nbconvert-safe print (display may not render)
                print(scope.to_string(index=False))

            # ---- Plot Top N with a Year/Month column on the right ----
            try:
                top_plot = top_risk.sort_values(rate_col, ascending=True).copy()

                fig, ax = plt.subplots(figsize=(12, 6))
                y = np.arange(len(top_plot))
                vals = top_plot[rate_col].values

                ax.barh(y, vals, edgecolor="black")

                # Left labels (station name)
                ax.set_yticks(y)
                ax.set_yticklabels([str(n)[:40] for n in top_plot[name_col]], fontsize=9)

                ax.set_xlabel("Risk rate per 100k trips", fontsize=11)
                ax.set_title(f"Top {TOP_N} Highest-Risk Station-Periods (Credible)", fontsize=12, fontweight="bold")
                ax.grid(axis="x", alpha=0.3)

                # Create space for right-side column
                x_max = float(np.nanmax(vals)) if len(vals) else 0.0
                pad = 0.18 * x_max if x_max > 0 else 1.0
                ax.set_xlim(0, x_max + pad)

                # Right column positions
                x_sep = x_max + pad * 0.08
                x_year = x_max + pad * 0.35
                x_month = x_max + pad * 0.70

                # Separator line
                ax.axvline(x_sep, color="gray", linewidth=1, alpha=0.6)

                # Headers + values
                if has_year and has_month:
                    ax.text(x_year, len(y) - 0.15, "Year", ha="center", va="bottom",
                            fontsize=10, fontweight="bold")
                    ax.text(x_month, len(y) - 0.15, "Month", ha="center", va="bottom",
                            fontsize=10, fontweight="bold")

                    for i, (_, row) in enumerate(top_plot.iterrows()):
                        yr = row.get("year", pd.NA)
                        mo = row.get("month", pd.NA)
                        yr_txt = "" if pd.isna(yr) else str(int(yr))
                        mo_txt = "" if pd.isna(mo) else str(int(mo)).zfill(2)
                        ax.text(x_year, i, yr_txt, ha="center", va="center", fontsize=9)
                        ax.text(x_month, i, mo_txt, ha="center", va="center", fontsize=9)

                plt.tight_layout()
                savefig("top10_highest_risk_station_periods.png")
                plt.show()
                print("✓ Saved plot: top10_highest_risk_station_periods.png")

            except Exception as e:
                print(f"Could not generate plot: {e}")
======================================================================
HIGH-RISK STATIONS ANALYSIS
======================================================================
High-risk threshold: 90th percentile of eb_risk_rate_per_100k_trips among credible rows
Cutoff value: 628.72
High-risk rows: 1,646
Total exposure in high-risk rows: 12,567,625 trips

Top 10 Highest-Risk (credible) station-periods:
mode year month start_station_id start_station_name station_lat station_lng exposure_trips crash_count eb_risk_rate_per_100k_trips
152 nyc 2019 6 3734 E 58 St & 1 Ave (NW Corner) 40.759125 -73.962658 5008 116 2068.172197
150 nyc 2019 2 529.0 W 42 St & 8 Ave 40.757570 -73.990985 5098 109 2035.034813
84 nyc 2019 5 3712 W 35 St & Dyer Ave 40.754692 -73.997402 5486 120 1999.473641
184 nyc 2019 5 3749 Lexington Ave & E 36 St 40.747574 -73.978801 5108 112 1992.237399
141 nyc 2019 2 401.0 Allen St & Rivington St 40.720196 -73.989978 5270 109 1979.209296
75 nyc 2019 10 464 E 56 St & 3 Ave 40.759345 -73.967597 5731 122 1934.592498
87 nyc 2019 12 529 W 42 St & 8 Ave 40.757570 -73.990985 5672 112 1931.177105
237 nyc 2019 11 3236 W 42 St & Dyer Ave 40.758985 -73.993800 5081 106 1926.525165
188 nyc 2019 1 529.0 W 42 St & 8 Ave 40.757570 -73.990985 5319 105 1907.926813
203 nyc 2019 2 531.0 Forsyth St & Broome St 40.718939 -73.992663 5339 104 1878.788813
Scorecard scope (rows by year-month):
 year  month  rows
 2019      1   769
 2019      2   767
 2019      3   770
 2019      4   786
 2019      5   796
 2019      6   795
 2019      7   790
 2019      8   795
 2019      9   808
 2019     10   840
 2019     11   870
 2019     12   881
 2020      1   917
 2020      2   908
 2020      3   904
 2020      4   900
 2020      5   940
 2020      6   978
 2020      7  1008
 2020      8  1055
 2020      9  1106
 2020     10  1162
 2020     11  1166
 2020     12  1189
 2023      1  1817
 2023      2  1872
 2023      3  1898
 2023      4  1912
 2023      5  1910
 2023      6  1923
 2023      7  1964
 2023      8  2014
 2023      9  2060
 2023     10  2180
 2023     11  2103
 2023     12  2202
 2025      1  2252
 2025      2  2244
 2025      3  2247
 2025      4  2247
 2025      5  2245
 2025      6  2207
 2025      7  2206
 2025      8  2198
 2025      9  2213
 2025     10  2203
 2025     11  2222
 2025     12  2227
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/top10_highest_risk_station_periods.png
No description has been provided for this image
✓ Saved plot: top10_highest_risk_station_periods.png
In [3]:
import os
import re
import math
import pandas as pd
import matplotlib.pyplot as plt

# -----------------------------
# Helpers: parse env selections
# -----------------------------
def _parse_int_list_env(name: str):
    raw = os.environ.get(name, "").strip()
    if not raw:
        return None
    parts = re.split(r"[,\s]+", raw)
    out = []
    for p in parts:
        p = p.strip()
        if not p:
            continue
        try:
            out.append(int(p))
        except ValueError:
            pass
    return out or None

MODE_ENV = (os.environ.get("CITIBIKE_MODE", "") or "unknown").strip().lower()
years_env = _parse_int_list_env("CITIBIKE_YEARS")
months_env = _parse_int_list_env("CITIBIKE_MONTHS")

# -----------------------------
# Column choices
# -----------------------------
exposure_col = "exposure_trips"
rate_col = "eb_risk_rate_per_100k_trips"
crash_col = "crash_count" if "crash_count" in df_score.columns else None

needed = [exposure_col, rate_col]
missing = [c for c in needed if c not in df_score.columns]
if missing:
    raise ValueError(f"df_score missing required columns: {missing}")

plot_df = df_score.copy()

# Optional: only credible rows
if "credibility_flag" in plot_df.columns:
    plot_df = plot_df[plot_df["credibility_flag"] == "credible"].copy()

# Clean types
plot_df[exposure_col] = pd.to_numeric(plot_df[exposure_col], errors="coerce")
plot_df[rate_col] = pd.to_numeric(plot_df[rate_col], errors="coerce")
if crash_col is not None:
    plot_df[crash_col] = pd.to_numeric(plot_df[crash_col], errors="coerce").fillna(0)

plot_df = plot_df.dropna(subset=[exposure_col, rate_col])
plot_df = plot_df[plot_df[exposure_col] > 0].copy()

has_period_cols = ("year" in plot_df.columns) and ("month" in plot_df.columns)

# -----------------------------
# Styling helpers (matches your snippet)
# -----------------------------
def _draw_scatter(ax, df_slice: pd.DataFrame):
    if crash_col is not None:
        sc = ax.scatter(
            df_slice[exposure_col],
            df_slice[rate_col],
            c=df_slice[crash_col].fillna(0),
            alpha=0.6,
            s=30,  # a bit smaller for dense multi-panels
        )
        return sc
    else:
        ax.scatter(
            df_slice[exposure_col],
            df_slice[rate_col],
            alpha=0.6,
            s=30,
        )
        return None

def _mark_hotspots(ax, df_slice: pd.DataFrame):
    if "prevention_hotspot" in df_slice.columns:
        hotspots = df_slice[df_slice["prevention_hotspot"] == True]
        if len(hotspots) > 0:
            ax.scatter(
                hotspots[exposure_col],
                hotspots[rate_col],
                color="red", s=130, marker="*",
                edgecolors="black", linewidths=1,
                label="Prevention Hotspots", zorder=5
            )
            ax.legend(loc="best", fontsize=7)

def _style_axes(ax, title: str):
    ax.set_xlabel("Exposure (trips)", fontsize=9)
    ax.set_ylabel("EB Risk Rate (per 100k trips)", fontsize=9)
    ax.set_title(title, fontsize=10, fontweight="bold")
    ax.set_xscale("log")
    ax.grid(alpha=0.3)

# -----------------------------
# If no year/month: do NOT crash (keeps all-both working)
# -----------------------------
if not has_period_cols:
    print(f"[panel] No year/month columns for MODE={MODE_ENV}. Making overall plot only.")

    fig, ax = plt.subplots(figsize=(7.2, 4.8))
    sc = _draw_scatter(ax, plot_df)
    _mark_hotspots(ax, plot_df)
    _style_axes(ax, f"Risk vs Exposure (overall) — mode={MODE_ENV}")

    if sc is not None:
        plt.colorbar(sc, ax=ax, label="Crash Count")

    savefig(f"risk_vs_exposure_overall_mode{MODE_ENV}.png")
    plt.show()

else:
    # -----------------------------
    # Prepare year/month selections (no hardcoding)
    # -----------------------------
    plot_df["year"] = pd.to_numeric(plot_df["year"], errors="coerce")
    plot_df["month"] = pd.to_numeric(plot_df["month"], errors="coerce")
    plot_df = plot_df.dropna(subset=["year", "month"]).copy()
    plot_df["year"] = plot_df["year"].astype(int)
    plot_df["month"] = plot_df["month"].astype(int)

    years_available = sorted(plot_df["year"].unique())
    months_available = sorted(plot_df["month"].unique())

    years_to_plot = years_env if years_env is not None else years_available
    months_to_plot = months_env if months_env is not None else months_available

    years_to_plot = [y for y in years_to_plot if y in years_available]
    months_to_plot = [m for m in months_to_plot if m in months_available]

    if not years_to_plot or not months_to_plot:
        print(f"[panel] No data after filtering for MODE={MODE_ENV}. years_env={years_env}, months_env={months_env}")

    else:
        print(f"[panel] MODE={MODE_ENV} years={years_to_plot} months={months_to_plot} rows={len(plot_df):,}")

        # =========================================================
        # PANEL A (paginated): one subplot per (year, month)
        # =========================================================
        pairs = [(y, m) for y in years_to_plot for m in months_to_plot]
        total = len(pairs)

        # Pagination settings (good for up to ~48+ plots)
        PANELS_PER_PAGE = 12  # 12 => 3x4 (readable)
        NCOLS = 3
        NROWS = int(math.ceil(PANELS_PER_PAGE / NCOLS))
        pages = int(math.ceil(total / PANELS_PER_PAGE))

        print(f"[panelA] Total period panels={total} -> pages={pages} (PANELS_PER_PAGE={PANELS_PER_PAGE})")

        # For a stable shared color scale across pages (so colors are comparable),
        # compute global vmin/vmax once.
        vmin = vmax = None
        if crash_col is not None and len(plot_df) > 0:
            vmin = float(plot_df[crash_col].min())
            vmax = float(plot_df[crash_col].max())

        for page_idx in range(pages):
            start = page_idx * PANELS_PER_PAGE
            end = min((page_idx + 1) * PANELS_PER_PAGE, total)
            page_pairs = pairs[start:end]

            figA, axesA = plt.subplots(
                NROWS, NCOLS,
                figsize=(4.6 * NCOLS, 3.6 * NROWS),
                squeeze=False
            )

            scatter_for_cbar = None

            for i, (y, m) in enumerate(page_pairs):
                r, c = divmod(i, NCOLS)
                ax = axesA[r][c]

                d = plot_df[(plot_df["year"] == y) & (plot_df["month"] == m)].copy()

                if crash_col is not None:
                    sc = ax.scatter(
                        d[exposure_col],
                        d[rate_col],
                        c=d[crash_col].fillna(0),
                        alpha=0.6,
                        s=30,
                        vmin=vmin,
                        vmax=vmax,
                    )
                else:
                    sc = ax.scatter(
                        d[exposure_col],
                        d[rate_col],
                        alpha=0.6,
                        s=30,
                    )

                _mark_hotspots(ax, d)
                _style_axes(ax, f"{y}-{m:02d} (mode={MODE_ENV})")

                if scatter_for_cbar is None and crash_col is not None:
                    scatter_for_cbar = sc

            # Turn off unused axes on last page
            for j in range(len(page_pairs), NROWS * NCOLS):
                r, c = divmod(j, NCOLS)
                axesA[r][c].axis("off")

            figA.suptitle(
                f"Risk vs Exposure — mode={MODE_ENV} (page {page_idx+1}/{pages})",
                fontsize=13, fontweight="bold"
            )
            figA.tight_layout(rect=[0.0, 0.0, 1.0, 0.93])

            # Shared colorbar that matches the axes block height (NOT the full figure)
            if scatter_for_cbar is not None:
                import matplotlib as mpl
                sm = mpl.cm.ScalarMappable(norm=scatter_for_cbar.norm, cmap=scatter_for_cbar.cmap)
                sm.set_array([])
                visible_axes = [ax for ax in axesA.ravel() if ax.get_visible() and ax.has_data()]
                cbar = figA.colorbar(sm, ax=visible_axes, fraction=0.035, pad=0.02)
                cbar.set_label("Crash Count")

            savefig(f"risk_vs_exposure_panel_periods_mode{MODE_ENV}_page{page_idx+1:02d}.png")
            plt.show()

        # =========================================================
        # PANEL B: YEARLY COMPARISON (overlay years) — one subplot per month
        # =========================================================
        # This scales well: one subplot per month, overlay each year.
        n = len(months_to_plot)
        ncols = min(3, n) if n > 1 else 1
        nrows = int(math.ceil(n / ncols))

        figB, axesB = plt.subplots(
            nrows, ncols,
            figsize=(4.6 * ncols, 3.6 * nrows),
            squeeze=False
        )

        # If too many years, auto-switch to endpoints to keep readable
        if len(years_to_plot) > 3:
            years_overlay = [min(years_to_plot), max(years_to_plot)]
            overlay_tag = f"endpoints_{years_overlay[0]}_{years_overlay[1]}"
        else:
            years_overlay = list(years_to_plot)
            overlay_tag = "allYears"

        for i, m in enumerate(months_to_plot):
            r, c = divmod(i, ncols)
            ax = axesB[r][c]

            any_data = False
            for y in years_overlay:
                d = plot_df[(plot_df["year"] == y) & (plot_df["month"] == m)].copy()
                if d.empty:
                    continue
                any_data = True
                ax.scatter(
                    d[exposure_col],
                    d[rate_col],
                    alpha=0.5,
                    s=22,
                    label=str(y)
                )
                _mark_hotspots(ax, d)

            _style_axes(ax, f"Year comparison — month {m:02d} (mode={MODE_ENV})")
            if any_data:
                ax.legend(title="Year", fontsize=8, title_fontsize=9, loc="best")
            else:
                ax.text(0.5, 0.5, "No data", transform=ax.transAxes, ha="center", va="center")
                ax.set_axis_off()

        # Turn off unused axes
        for j in range(n, nrows * ncols):
            r, c = divmod(j, ncols)
            axesB[r][c].axis("off")

        figB.suptitle(
            f"Yearly comparison (overlay: {overlay_tag}) — mode={MODE_ENV}",
            fontsize=13, fontweight="bold"
        )
        figB.tight_layout(rect=[0.0, 0.0, 1.0, 0.93])

        savefig(f"risk_vs_exposure_panel_year_compare_mode{MODE_ENV}_{overlay_tag}.png")
        plt.show()
[panel] MODE=nyc years=[2019, 2020, 2023, 2025] months=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] rows=16,454
[panelA] Total period panels=48 -> pages=4 (PANELS_PER_PAGE=12)
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page01.png
No description has been provided for this image
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page02.png
No description has been provided for this image
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page03.png
No description has been provided for this image
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_periods_modenyc_page04.png
No description has been provided for this image
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/risk_vs_exposure_panel_year_compare_modenyc_endpoints_2019_2025.png
No description has been provided for this image
In [4]:
import os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# -----------------------------
# Helpers: parse env selections
# -----------------------------
def _parse_int_list_env(name: str):
    raw = os.environ.get(name, "").strip()
    if not raw:
        return None
    parts = re.split(r"[,\s]+", raw)
    out = []
    for p in parts:
        p = p.strip()
        if not p:
            continue
        try:
            out.append(int(p))
        except ValueError:
            pass
    return out or None

MODE_ENV = (os.environ.get("CITIBIKE_MODE", "") or "unknown").strip().lower()
years_env = _parse_int_list_env("CITIBIKE_YEARS")
months_env = _parse_int_list_env("CITIBIKE_MONTHS")

# ========================================================================
# CHECK: Does df_score have year/month columns?
# ========================================================================
needed = ["year", "month"]
missing = [c for c in needed if c not in df_score.columns]

if missing:
    print("="*80)
    print("  SEASONALITY ANALYSIS SKIPPED")
    print("="*80)
    print(f"Reason: df_score missing required columns: {missing}")
    print(f"Mode: {MODE_ENV}")
    print("")
    print("This is expected for JC mode (no crash data with temporal granularity).")
    print("Seasonality analysis requires year/month breakdown, which only exists for NYC.")
    print("")
    print("Available columns in df_score:")
    print(list(df_score.columns))
    print("="*80)
else:
    # ========================================================================
    # PROCEED WITH SEASONALITY ANALYSIS
    # ========================================================================
    print(f" df_score has year/month columns → proceeding with seasonality analysis")
    
    # -----------------------------
    # What to plot (time-comparable)
    # -----------------------------
    # Use EB rate for time comparison (risk_index_pct is within-period and not ideal across time)
    value_col = "eb_risk_rate_per_100k_trips"
    if value_col not in df_score.columns:
        if "risk_rate_per_100k_trips" in df_score.columns:
            value_col = "risk_rate_per_100k_trips"
        else:
            raise ValueError(
                "Need eb_risk_rate_per_100k_trips (or risk_rate_per_100k_trips) in df_score."
            )

    df = df_score.copy()

    # Optional: credible only
    if "credibility_flag" in df.columns:
        df = df[df["credibility_flag"] == "credible"].copy()

    # Clean
    df["year"] = pd.to_numeric(df["year"], errors="coerce")
    df["month"] = pd.to_numeric(df["month"], errors="coerce")
    df[value_col] = pd.to_numeric(df[value_col], errors="coerce")

    df = df.dropna(subset=["year", "month", value_col]).copy()
    df["year"] = df["year"].astype(int)
    df["month"] = df["month"].astype(int)

    # Apply env filters (no hardcoding)
    years_available = sorted(df["year"].unique())
    months_available = sorted(df["month"].unique())

    years_to_use = years_env if years_env is not None else years_available
    months_to_use = months_env if months_env is not None else months_available

    years_to_use = [y for y in years_to_use if y in years_available]
    months_to_use = [m for m in months_to_use if m in months_available]

    df = df[df["year"].isin(years_to_use) & df["month"].isin(months_to_use)].copy()
    if df.empty:
        raise ValueError(f"No rows after filtering. years={years_to_use} months={months_to_use} mode={MODE_ENV}")

    print(f"[seasonality] mode={MODE_ENV} years={years_to_use} months={months_to_use} rows={len(df):,} value={value_col}")

    # -------------------------------------------------------------------
    # Summary stats per (year, month): median + IQR
    # -------------------------------------------------------------------
    g = df.groupby(["year", "month"])[value_col]
    summary = g.quantile([0.25, 0.5, 0.75]).unstack(level=-1).reset_index()
    summary.columns = ["year", "month", "q25", "q50", "q75"]

    # -------------------------------------------------------------------
    # Plot 1: Seasonal lines (median ± IQR) per year
    # -------------------------------------------------------------------
    fig1, ax1 = plt.subplots(figsize=(10, 5.5))

    for y in years_to_use:
        s = summary[summary["year"] == y].sort_values("month")
        if s.empty:
            continue
        ax1.plot(s["month"], s["q50"], marker="o", linewidth=2, label=str(y))
        ax1.fill_between(s["month"], s["q25"], s["q75"], alpha=0.15)

    ax1.set_title(f"Seasonality of station risk by month (median ± IQR) — mode={MODE_ENV}", fontweight="bold")
    ax1.set_xlabel("Month")
    ax1.set_ylabel(f"{value_col}")
    ax1.set_xticks(sorted(months_to_use))
    ax1.grid(alpha=0.3)
    ax1.legend(
        title="Year",
        ncol=min(6, max(1, len(years_to_use))),
        fontsize=9,
        title_fontsize=10
    )

    savefig(f"seasonality_lines_median_IQR_mode{MODE_ENV}.png")
    plt.show()

    # -------------------------------------------------------------------
    # Plot 2: Heatmap (Year × Month) of median risk with readable labels
    # -------------------------------------------------------------------
    pivot = (
        summary.pivot(index="year", columns="month", values="q50")
        .reindex(index=years_to_use, columns=sorted(months_to_use))
    )

    fig2, ax2 = plt.subplots(figsize=(10, 3.0 + 0.5 * len(years_to_use)))

    # More readable colormap + stable perception
    im = ax2.imshow(pivot.values, aspect="auto", cmap="cividis")

    ax2.set_title(f"Year × Month median station risk — mode={MODE_ENV}", fontweight="bold")
    ax2.set_xlabel("Month")
    ax2.set_ylabel("Year")

    ax2.set_xticks(range(len(pivot.columns)))
    ax2.set_xticklabels([f"{m:02d}" for m in pivot.columns])
    ax2.set_yticks(range(len(pivot.index)))
    ax2.set_yticklabels([str(y) for y in pivot.index])

    cbar = fig2.colorbar(im, ax=ax2, fraction=0.03, pad=0.02)
    cbar.set_label(f"Median {value_col}")

    # Adaptive text color + subtle bbox so numbers are readable on dark cells
    norm = im.norm
    cmap = im.cmap
    for i, y in enumerate(pivot.index):
        for j, m in enumerate(pivot.columns):
            val = pivot.loc[y, m]
            if pd.isna(val):
                continue

            rgba = cmap(norm(val))
            luminance = 0.299 * rgba[0] + 0.587 * rgba[1] + 0.114 * rgba[2]
            txt_color = "black" if luminance > 0.6 else "white"

            ax2.text(
                j, i, f"{val:.0f}",
                ha="center", va="center",
                fontsize=9, fontweight="bold",
                color=txt_color,
                bbox=dict(
                    boxstyle="round,pad=0.15",
                    facecolor=("white" if txt_color == "black" else "black"),
                    alpha=0.20,
                    edgecolor="none"
                )
            )

    fig2.tight_layout()
    savefig(f"seasonality_heatmap_year_month_mode{MODE_ENV}.png")
    plt.show()
 df_score has year/month columns → proceeding with seasonality analysis
[seasonality] mode=nyc years=[2019, 2020, 2023, 2025] months=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] rows=16,454 value=eb_risk_rate_per_100k_trips
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/seasonality_lines_median_IQR_modenyc.png
No description has been provided for this image
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/seasonality_heatmap_year_month_modenyc.png
No description has been provided for this image
In [5]:
import os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.lines as mlines

def _parse_int_list_env(name: str):
    raw = os.environ.get(name, "").strip()
    if not raw:
        return None
    parts = re.split(r"[,\s]+", raw)
    out = []
    for p in parts:
        p = p.strip()
        if not p:
            continue
        try:
            out.append(int(p))
        except ValueError:
            pass
    return out or None

MODE_ENV = (os.environ.get("CITIBIKE_MODE", "") or "unknown").strip().lower()
years_env = _parse_int_list_env("CITIBIKE_YEARS")
months_env = _parse_int_list_env("CITIBIKE_MONTHS")

rate_col = "eb_risk_rate_per_100k_trips"
exp_col = "exposure_trips"
id_col = "start_station_id" if "start_station_id" in df_score.columns else None

needed = [rate_col, exp_col]
missing = [c for c in needed if c not in df_score.columns]
if missing:
    raise ValueError(f"df_score missing required columns: {missing}")

df = df_score.copy()

# Optional: credible only (keeps distributions stable)
if "credibility_flag" in df.columns:
    df = df[df["credibility_flag"] == "credible"].copy()

# Clean
df[rate_col] = pd.to_numeric(df[rate_col], errors="coerce")
df[exp_col] = pd.to_numeric(df[exp_col], errors="coerce")
df = df.dropna(subset=[rate_col, exp_col])
df = df[df[exp_col] > 0].copy()

has_period_cols = ("year" in df.columns) and ("month" in df.columns)

if not has_period_cols:
    print(f"[yearly-plots] No year/month columns for MODE={MODE_ENV}. Skipping yearly comparison plots.")
else:
    df["year"] = pd.to_numeric(df["year"], errors="coerce")
    df["month"] = pd.to_numeric(df["month"], errors="coerce")
    df = df.dropna(subset=["year", "month"]).copy()
    df["year"] = df["year"].astype(int)
    df["month"] = df["month"].astype(int)

    years_available = sorted(df["year"].unique())
    months_available = sorted(df["month"].unique())

    years_to_use = years_env if years_env is not None else years_available
    months_to_use = months_env if months_env is not None else months_available
    years_to_use = [y for y in years_to_use if y in years_available]
    months_to_use = [m for m in months_to_use if m in months_available]

    # Filter to chosen months/years
    df = df[df["year"].isin(years_to_use) & df["month"].isin(months_to_use)].copy()

    if df.empty:
        print(f"[yearly-plots] Empty after filtering. years={years_to_use} months={months_to_use} MODE={MODE_ENV}")
    else:
        print(f"[yearly-plots] MODE={MODE_ENV} years={years_to_use} months={months_to_use} rows={len(df):,}")

        # =========================================================
        # Plot A: Distribution by year (boxplot of station-period EB rates)
        # =========================================================
        figA, axA = plt.subplots(figsize=(8.5, 4.8))

        data_by_year = [df.loc[df["year"] == y, rate_col].values for y in years_to_use]

        # Use tick_labels (new matplotlib) + style median line explicitly
        bp = axA.boxplot(
            data_by_year,
            tick_labels=[str(y) for y in years_to_use],
            showfliers=False,
            medianprops=dict(color="red", linewidth=2),
        )

        axA.set_title(f"Distribution of EB Risk Rate by Year — mode={MODE_ENV}", fontweight="bold")
        axA.set_xlabel("Year")
        axA.set_ylabel("EB Risk Rate (per 100k trips)")
        axA.grid(alpha=0.3)

        # Legend entry to explain the horizontal line is the median
        median_handle = mlines.Line2D([], [], color="red", linewidth=2, label="Median")
        axA.legend(handles=[median_handle], loc="best")

        # # Optional: also add a small in-plot annotation (comment out if you prefer legend only)
        # axA.text(
        #     0.98, 0.98,
        #     "Red line = median",
        #     transform=axA.transAxes,
        #     ha="right", va="top",
        #     fontsize=10,
        #     bbox=dict(boxstyle="round,pad=0.25", facecolor="white", alpha=0.85, edgecolor="gray")
        # )

        savefig(f"yearly_distribution_boxplot_mode{MODE_ENV}.png")
        plt.show()

        # =========================================================
        # Plot B: Station-level change (endpoints slopegraph)
        # =========================================================
        if id_col is None:
            print("[yearly-plots] No start_station_id column; skipping station-level change plot.")
        else:
            # Aggregate within each year across selected months:
            # exposure-weighted average EB rate (stable)
            def _agg_one(grp: pd.DataFrame) -> pd.Series:
                w = grp[exp_col].to_numpy()
                x = grp[rate_col].to_numpy()
                wsum = float(np.sum(w))
                if wsum <= 0 or len(x) == 0:
                    return pd.Series({"exposure_sum": 0.0, "eb_rate_weighted": np.nan})
                return pd.Series({"exposure_sum": wsum, "eb_rate_weighted": float(np.average(x, weights=w))})

            g = (
                df.groupby([id_col, "year"], as_index=False)
                  .apply(_agg_one)
                  .reset_index(drop=True)
            )

            y0, y1 = min(years_to_use), max(years_to_use)

            gg0 = g[g["year"] == y0][[id_col, "exposure_sum", "eb_rate_weighted"]].rename(
                columns={"exposure_sum": "exp0", "eb_rate_weighted": "rate0"}
            )
            gg1 = g[g["year"] == y1][[id_col, "exposure_sum", "eb_rate_weighted"]].rename(
                columns={"exposure_sum": "exp1", "eb_rate_weighted": "rate1"}
            )
            merged = gg0.merge(gg1, on=id_col, how="inner").dropna(subset=["rate0", "rate1"]).copy()

            if merged.empty:
                print(f"[yearly-plots] No stations present in BOTH {y0} and {y1}; skipping slopegraph.")
            else:
                merged["exp_total"] = merged["exp0"] + merged["exp1"]
                TOP_N = 120
                merged = merged.sort_values("exp_total", ascending=False).head(TOP_N).copy()

                figB, axB = plt.subplots(figsize=(8.5, 5.5))

                x_positions = [0, 1]
                for _, r in merged.iterrows():
                    axB.plot(
                        x_positions,
                        [r["rate0"], r["rate1"]],
                        alpha=0.25,
                        linewidth=1
                    )

                # Median line for context
                axB.plot(
                    x_positions,
                    [merged["rate0"].median(), merged["rate1"].median()],
                    linewidth=3,
                    label="Median (top exposure)"
                )

                axB.set_xticks(x_positions)
                axB.set_xticklabels([str(y0), str(y1)])
                axB.set_title(
                    f"Station-level EB Risk Change (Exposure-weighted) — months={months_to_use} — mode={MODE_ENV}",
                    fontweight="bold"
                )
                axB.set_ylabel("EB Risk Rate (per 100k trips)")
                axB.grid(alpha=0.3)
                axB.legend()

                savefig(f"station_change_slopegraph_{y0}_to_{y1}_mode{MODE_ENV}.png")
                plt.show()
[yearly-plots] MODE=nyc years=[2019, 2020, 2023, 2025] months=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] rows=16,454
Saved: /home/hamzeh-khanpour/Desktop/CITIBIK-main/reports/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/figures/yearly_distribution_boxplot_modenyc.png
No description has been provided for this image
[yearly-plots] No stations present in BOTH 2019 and 2025; skipping slopegraph.
/tmp/ipykernel_7412/2327989950.py:130: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(_agg_one)
In [ ]:
 

Executive Risk Summary¶

In [6]:
# --- EXECUTIVE RISK SUMMARY (nbconvert-safe, self-contained) ---

from pathlib import Path
import pandas as pd
import numpy as np

MIN_TRIPS = 5000

# Make sure df_score exists (avoid NameError)
if "df_score" not in globals() or df_score is None or len(df_score) == 0:
    print("  df_score is missing/empty — cannot build executive summary.")
else:
    print("\n" + "="*70)
    print("EXECUTIVE RISK SUMMARY FOR AXA")
    print("="*70)

    s = df_score.copy()

    # Defensive: ensure credibility_flag exists
    if "credibility_flag" not in s.columns:
        if "exposure_trips" in s.columns:
            s["credibility_flag"] = np.where(s["exposure_trips"] >= MIN_TRIPS, "credible", "insufficient_data")
        elif "trips" in s.columns:
            s["credibility_flag"] = np.where(s["trips"] >= MIN_TRIPS, "credible", "insufficient_data")
        else:
            s["credibility_flag"] = "unknown"

    # Decide exposure column
    exposure_col = "exposure_trips" if "exposure_trips" in s.columns else ("trips" if "trips" in s.columns else None)
    if exposure_col is not None:
        s[exposure_col] = pd.to_numeric(s[exposure_col], errors="coerce").fillna(0)

    # Infer mode label safely
    mode_label = "unknown"
    if "mode" in s.columns and s["mode"].notna().any():
        uniq_modes = sorted(s["mode"].astype(str).unique().tolist())
        mode_label = uniq_modes[0] if len(uniq_modes) == 1 else "multiple"

    # Build credible_stations HERE (fixes your NameError)
    credible_stations = s[s["credibility_flag"].astype(str).str.lower() == "credible"].copy()

    total_rows = len(s)
    credible_count = len(credible_stations)

    summary_data = []

    summary_data.append({
        "Metric": "Total station-period rows",
        "Value": f"{total_rows:,}",
        "Notes": f"mode={mode_label}"
    })

    if total_rows > 0:
        pct_cred = 100.0 * credible_count / total_rows
        summary_data.append({
            "Metric": f"Credible rows (≥{MIN_TRIPS:,} trips)",
            "Value": f"{credible_count:,} ({pct_cred:.1f}%)",
            "Notes": "Enough volume to trust risk ranking"
        })

    # Optional: risk_tier counts (only if column exists)
    if "risk_tier" in credible_stations.columns:
        for tier, note in [
            ("High Risk", "Premium pricing / targeted prevention"),
            ("Medium Risk", "Standard pricing"),
            ("Low Risk", "Discount / acquisition-friendly"),
        ]:
            n = int((credible_stations["risk_tier"].astype(str) == tier).sum())
            pct = (100.0 * n / credible_count) if credible_count > 0 else 0.0
            summary_data.append({
                "Metric": f"{tier} rows",
                "Value": f"{n:,} ({pct:.1f}%)",
                "Notes": note
            })

    # Hotspots (only if columns exist) — and handle True/False safely
    def _safe_true_count(df: pd.DataFrame, col: str) -> int:
        if col not in df.columns:
            return 0
        v = df[col]
        if v.dtype == bool:
            return int(v.sum())
        # tolerate strings like "True"/"False"
        return int(v.astype(str).str.lower().isin(["true", "1", "yes"]).sum())

    for col, label, note in [
        ("prevention_hotspot", "Prevention hotspots", "High exposure + high risk → safety pilots"),
        ("product_hotspot", "Product hotspots", "High exposure → strong sales reach"),
        ("acquisition_hotspot", "Acquisition hotspots", "High exposure + low risk → easy wins"),
    ]:
        if col in s.columns:
            n = _safe_true_count(s, col)
            summary_data.append({
                "Metric": label,
                "Value": f"{n:,}",
                "Notes": note
            })

    df_summary = pd.DataFrame(summary_data)
    display(df_summary)

    # Save summary (RUN_DIR safe)
    out_dir = None
    if "RUN_DIR" in globals() and RUN_DIR is not None:
        out_dir = Path(RUN_DIR)
    else:
        out_dir = Path(".").resolve()

    out_path = out_dir / "risk_executive_summary.csv"
    df_summary.to_csv(out_path, index=False)
    print(f"\nSaved executive summary to: {out_path}")
======================================================================
EXECUTIVE RISK SUMMARY FOR AXA
======================================================================
Metric Value Notes
0 Total station-period rows 72,466 mode=nyc
1 Credible rows (≥5,000 trips) 16,454 (22.7%) Enough volume to trust risk ranking
2 Prevention hotspots 2,557 High exposure + high risk → safety pilots
3 Product hotspots 14,494 High exposure → strong sales reach
4 Acquisition hotspots 10,223 High exposure + low risk → easy wins
Saved executive summary to: /home/hamzeh-khanpour/Desktop/CITIBIK-main/summaries/y2019_2020_2023_2025_m1_2_3_4_5_6_7_8_9_10_11_12_modenyc/risk_executive_summary.csv